Code
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(gtsummary)
library(reshape2)
library(RColorBrewer)
library(reshape2)
library(ggplot2)
library(caret)
The HINTS survey includes 527 questions. To focus on sentiment toward healthcare and cancer, we selected the questions in Table 1. These questions ask participants where they access health information, whether they trust the health information provided, and whether they felt frustrated while searching for it.
Table 1
| Variable | Description |
|---|---|
| SeekCancerInfo | Have you ever looked for information about cancer from any source? |
| CancerFrustrated | Based on the results of your most recent search for information about cancer, how much do you agree or disagree: You felt frustrated during your search for the information. |
| CancerTrustDoctor | In general, how much would you trust information about cancer from a doctor? |
| CancerTrustFamily | In general, how much would you trust information about cancer from family or friends? |
| CancerTrustGov | In general, how much would you trust information about cancer from government health agencies? |
| CancerTrustCharities | In general, how much would you trust information about cancer from charitable organizations? |
| CancerTrustReligiousOrgs | In general, how much would you trust information about cancer from religious organizations and leaders? |
| CancerTrustScientists | In general, how much would you trust information about cancer from scientists? |
| Electronic2_HealthInfo | In the past 12 months have you used the Internet to look for health or medical information? |
| MisleadingHealthInfo | How much of the health information that you see on social media do you think is false or misleading? |
| TrustHCSystem | How much do you trust the health care system (for example, hospitals, pharmacies, and other organizations involved in health care)? |
# Read the HINTS CSV, discard the SeekCancerInfo and HHID columns, and keep
# only the rows that have no missing value in any remaining column.
file_path <- "../data/csv/hints_cleaned_forML_spearman.csv"
hints_cleaned <- read.csv(file_path) %>%
  select(-c(SeekCancerInfo, HHID)) %>%
  na.omit()
# Optional inspection of the loaded data:
# print(head(hints_cleaned))
# print(paste("Shape of the dataframe:", paste(dim(hints_cleaned), collapse = " x ")))
# Converting to numeric
library(dplyr)

# Lookup tables from response label to numeric code. On every scale the
# strongest / affirmative answer gets the lowest code ("A lot" = 1,
# "Strongly agree" = 1, "Yes" = 1).
trust_mapping <- c(
  "Not at all" = 4, "A little" = 3, "Some" = 2, "A lot" = 1
)
agreement_mapping <- c(
  "Strongly agree" = 1, "Somewhat agree" = 2,
  "Somewhat disagree" = 3, "Strongly disagree" = 4
)
binary_mapping <- c("Yes" = 1, "No" = 2)
misleading_info_mapping <- c(
  "I do not use social media" = 5, "None" = 4,
  "A little" = 3, "Some" = 2, "A lot" = 1
)

# Replace the text responses with their numeric codes, column by column.
# NOTE(review): the raw survey labels for TrustHCSystem shown later in this
# document are "Very"/"Somewhat"; confirm that the cleaned CSV already uses the
# "A lot"/"Some" wording, otherwise those answers will not match trust_mapping.
hints_cleaned <- hints_cleaned %>%
  filter(!is.na(MisleadingHealthInfo)) %>%
  mutate(
    CancerFrustrated = recode(CancerFrustrated, !!!agreement_mapping),
    # Every trust question shares the same four-point scale.
    across(
      c(
        CancerTrustDoctor, CancerTrustFamily, CancerTrustGov,
        CancerTrustCharities, CancerTrustReligiousOrgs,
        CancerTrustScientists, TrustHCSystem
      ),
      ~ recode(.x, !!!trust_mapping)
    ),
    Electronic2_HealthInfo = recode(Electronic2_HealthInfo, !!!binary_mapping),
    MisleadingHealthInfo = recode(
      MisleadingHealthInfo, !!!misleading_info_mapping
    )
  )
# Optional inspection of the recoded data and its column types:
# cat("Data after applying mappings to numeric values:\n")
# print(head(hints_cleaned, n = 5), digits = 2)
# cat("\nColumn types:\n")
# str(hints_cleaned)
# Standardize the data (excluding non-numeric columns)
# Flag the numeric columns. vapply() always returns a logical vector, whereas
# sapply() changes its return type on empty input.
numeric_columns <- vapply(hints_cleaned, is.numeric, logical(1))

# Standardize the numeric columns and convert the result back to a data frame.
# drop = FALSE keeps a data frame (and its column name) even when only one
# column is numeric, so the target check below stays meaningful.
# Spearman correlation is rank-based, so this scaling does not change the
# coefficients computed below; it is kept so `standardized_data` remains
# available.
standardized_data <- hints_cleaned[, numeric_columns, drop = FALSE] %>%
  scale() %>%
  as.data.frame()

# Column whose correlations with the other features are of interest.
target_variable <- "TrustHCSystem"

# Fail early if the target column did not survive the numeric filter.
if (!(target_variable %in% colnames(standardized_data))) {
  stop(paste0(
    "Target variable '", target_variable, "' not found in the dataset."
  ))
}

# Spearman correlation between every pair of numeric columns.
correlation_matrix_spearman <- cor(standardized_data, method = "spearman")
# The same matrix under a second name.
correlation_data <- correlation_matrix_spearman
# print("Spearman Correlation Matrix (excluding SeekCancerInfo and after standardization):")
# print(correlation_data)

# Correlation of each feature with the target, strongest positive first.
# sort() drops NA entries by default.
correlation_with_target_spearman <- correlation_data[, target_variable] %>%
  sort(decreasing = TRUE)
# print(paste("\nSpearman correlation of features with", target_variable, ":"))
# print(correlation_with_target_spearman)
# Convert the correlation matrix into a long format for ggplot
# Long-format copy of the Spearman matrix: one row per (Var1, Var2) cell.
melted_correlation <- melt(correlation_matrix_spearman)
# Blank out the diagonal and the upper triangle so each pair is shown once.
# This relies on melt() listing the cells in the same column-major order in
# which the logical matrix from upper.tri() indexes a vector.
melted_correlation$value <- replace(
  melted_correlation$value,
  upper.tri(correlation_matrix_spearman, diag = TRUE),
  NA
)
# 100-step colour ramp running from white to dark teal.
custom_colors <- colorRampPalette(colors = c("#ffffff", "#3d6469"))(n = 100)
# Heatmap of the masked Spearman matrix. Each visible tile is labelled with its
# coefficient to two decimals; masked (NA) tiles are drawn white and unlabelled.
# NOTE(review): the palette runs white -> teal across limits of -1..1, so a
# strongly negative coefficient is drawn almost as white as a masked tile.
# Confirm whether a diverging palette centred on 0 was intended.
ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(
    colours = custom_colors,
    limits = c(-1, 1),
    na.value = "white",
    name = "Spearman\nCorrelation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(angle = 0, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "right"
  ) +
  labs(title = "Spearman Correlation Matrix (Ordinal Data)") +
  coord_fixed() +
  geom_text(
    aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))),
    color = "black", size = 2.5
  )

# Restore the objects saved in the HINTS 6 .rda file (presumably including
# `public`, which the following code reads). This call used to be fused onto
# the closing parenthesis of the plot expression, which does not parse.
load('../data/HINTS6_R_20240524/hints6_public.rda')
# Work on a plain data frame copy of `public` (presumably the object restored
# by the load() call above).
hints <- as.data.frame(public)
# print(colnames(hints))

# Identifier, timestamp and the survey questions analysed in this document.
columns <- c(
  "HHID", "updatedate", "SeekCancerInfo", "CancerFrustrated",
  "CancerTrustDoctor", "CancerTrustFamily", "CancerTrustGov",
  "CancerTrustCharities", "CancerTrustReligiousOrgs",
  "CancerTrustScientists", "Electronic2_HealthInfo",
  "MisleadingHealthInfo", "TrustHCSystem"
)
# all_of() errors if any listed column is absent from `hints`.
hints_select <- select(hints, all_of(columns))
# Disabled conversion of `updatedate` to a date-time:
# hints_select$updatedate <- hints_select$updatedate / 1000
# hints_select$updatedate <- as_datetime(hints_select$updatedate)
head(hints_select) HHID updatedate SeekCancerInfo CancerFrustrated
1 21000006 13870396800 No Inapplicable, coded 2 in SeekCancerInfo
2 21000009 13874630400 No Inapplicable, coded 2 in SeekCancerInfo
3 21000020 13873680000 Yes Somewhat disagree
4 21000022 13867891200 No Inapplicable, coded 2 in SeekCancerInfo
5 21000039 13866336000 No Inapplicable, coded 2 in SeekCancerInfo
6 21000043 13866595200 No Inapplicable, coded 2 in SeekCancerInfo
CancerTrustDoctor CancerTrustFamily
1 A lot Missing data (Not Ascertained)
2 A lot Some
3 A lot Some
4 A lot Missing data (Not Ascertained)
5 Some Some
6 A lot Some
CancerTrustGov CancerTrustCharities
1 Missing data (Not Ascertained) Missing data (Not Ascertained)
2 A lot Some
3 Some A little
4 Missing data (Not Ascertained) Missing data (Not Ascertained)
5 Some Not at all
6 Some A lot
CancerTrustReligiousOrgs CancerTrustScientists
1 Missing data (Not Ascertained) Missing data (Not Ascertained)
2 Some A lot
3 Not at all A lot
4 Missing data (Not Ascertained) Missing data (Not Ascertained)
5 Not at all Some
6 A little A lot
Electronic2_HealthInfo MisleadingHealthInfo
1 Question answered in error (Commission Error) I do not use social media
2 Yes I do not use social media
3 Yes Some
4 Inapplicable, coded 2 in UseInternet I do not use social media
5 Yes A lot
6 Yes A lot
TrustHCSystem
1 Very
2 Very
3 Somewhat
4 Somewhat
5 Somewhat
6 A little
The bar graphs give a first, general overview of the responses to each question. These plots show how strongly participants agree or disagree with each question. For example, many participants trust doctors a lot, while far fewer place the same level of trust in family members.
[1] Missing data (Not Ascertained)
[2] Missing data (Filter Missing)
[3] Multiple responses selected in error
[4] Question answered in error (Commission Error)
[5] Inapplicable, coded 2 in SeekCancerInfo
[6] Strongly agree
[7] Somewhat agree
[8] Somewhat disagree
[9] Strongly disagree
[10] A lot
[11] Some
[12] A little
[13] Not at all
[14] Yes
[15] No
[16] Inapplicable, coded 2 in UseInternet
[17] Missing data (Web partial - Question Never Seen)
[18] None
[19] I do not use social media
[20] Very
[21] Somewhat
21 Levels: Missing data (Not Ascertained) Yes ... Somewhat
# Substantive answers to keep; missing-data, error and "Inapplicable" codes are
# left out.
values <- c(
  "Strongly agree", "Somewhat agree", "Somewhat disagree", "Strongly disagree",
  "A lot", "Some", "A little", "Not at all",
  "Yes", "No",
  "None", "I do not use social media",
  "Very", "Somewhat"
)
plot_data_filtered <- filter(plot_data, Value %in% values)
# Rebuild Value as a factor whose levels are only the answers still present,
# in sorted order.
plot_data_filtered$Value <- with(
  plot_data_filtered,
  factor(Value, levels = sort(unique(Value)))
)

# Split the questions into three groups, one per figure.
columns_1 <- c(
  "SeekCancerInfo", "CancerFrustrated", "CancerTrustDoctor", "CancerTrustFamily"
)
plot_data_filtered_1 <- filter(plot_data_filtered, Variable %in% columns_1)

columns_2 <- c(
  "CancerTrustGov", "CancerTrustCharities",
  "CancerTrustReligiousOrgs", "CancerTrustScientists"
)
plot_data_filtered_2 <- filter(plot_data_filtered, Variable %in% columns_2)

columns_3 <- c("Electronic2_HealthInfo", "MisleadingHealthInfo", "TrustHCSystem")
plot_data_filtered_3 <- filter(plot_data_filtered, Variable %in% columns_3)
# Bar chart of response counts for the first group of questions, built up one
# layer at a time.
p <- ggplot(plot_data_filtered_1, aes(x = Value, y = n, fill = Variable))
# Bar heights come straight from the precomputed counts in `n`.
p <- p + geom_bar(stat = "identity")
# One panel per question, each with its own set of x-axis categories.
p <- p + facet_wrap(~ Variable, scales = "free_x")
p <- p + theme_minimal()
p <- p + labs(
  title = "HINTS Survey Responses",
  x = "Responses",
  y = "Count",
  fill = "Question"
)
# Slant the response labels on the x axis.
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
pquartz_off_screen
2
# Bar chart of response counts for the second group of questions.
p <- plot_data_filtered_2 %>%
  ggplot(aes(x = Value, y = n, fill = Variable))
# Bars use the precomputed counts in `n`; each question gets its own panel with
# its own x-axis categories.
p <- p +
  geom_bar(stat = "identity") +
  facet_wrap(~ Variable, scales = "free_x")
# Base theme first, then the slanted x-axis labels on top of it.
p <- p +
  theme_minimal() +
  labs(
    title = "HINTS Survey Responses",
    x = "Responses",
    y = "Count",
    fill = "Question"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
pquartz_off_screen
2
# Bar chart of response counts for the third group of questions, with the
# panels arranged in two rows.
p <- ggplot(
  data = plot_data_filtered_3,
  mapping = aes(x = Value, y = n, fill = Variable)
)
p <- p + geom_bar(stat = "identity")
p <- p + facet_wrap(~ Variable, scales = "free_x", nrow=2)
p <- p + theme_minimal()
p <- p + labs(
  title = "HINTS Survey Responses",
  x = "Responses",
  y = "Count",
  fill = "Question"
)
# Slant the response labels on the x axis.
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
pquartz_off_screen
2
The responses to the questions can be coded on a scale from 0 to 3. For example, “Not at all” is coded as 0, “A little” as 1, “Some” as 2, and “A lot” as 3. After coding these responses, the mean, median, and mode are calculated to highlight overall trends in the data.
[1] I do not use social media
[2] Some
[3] A lot
[4] A little
[5] None
[6] Missing data (Not Ascertained)
[7] Missing data (Web partial - Question Never Seen)
7 Levels: Missing data (Not Ascertained) ...
[1] "HHID" "updatedate"
[3] "SeekCancerInfo" "CancerFrustrated"
[5] "CancerTrustDoctor" "CancerTrustFamily"
[7] "CancerTrustGov" "CancerTrustCharities"
[9] "CancerTrustReligiousOrgs" "CancerTrustScientists"
[11] "Electronic2_HealthInfo" "MisleadingHealthInfo"
[13] "TrustHCSystem"
# Convert the text responses in `hints_select` to ordinal scores so summary
# statistics can be computed. Each lookup maps a response label to its score;
# any label without an entry (missing-data codes, "Inapplicable, ...",
# commission errors) becomes NA.

# Agreement with "You felt frustrated during your search for the information".
agreement_codes <- c(
  "Strongly disagree" = 0, "Somewhat disagree" = 1,
  "Somewhat agree" = 2, "Strongly agree" = 3
)
# Shared by all CancerTrust* questions. The lowest category is "Not at all";
# CancerTrustFamily must use it as well (matching it against "None" codes every
# "Not at all" answer as NA and leaves the column without any 0 scores).
trust_codes <- c("Not at all" = 0, "A little" = 1, "Some" = 2, "A lot" = 3)
yes_no_codes <- c("No" = 0, "Yes" = 1)
# NOTE(review): "I do not use social media" shares score 0 with "None";
# confirm that scoring non-users as seeing no misleading information is
# intended rather than treating them as NA.
misleading_codes <- c(
  "None" = 0, "I do not use social media" = 0,
  "A little" = 1, "Some" = 2, "A lot" = 3
)
# TrustHCSystem uses its own wording for the top two categories.
hc_system_codes <- c(
  "Not at all" = 0, "A little" = 1, "Somewhat" = 2, "Very" = 3
)

# Look up each response label of `x` in the named numeric vector `codes`.
# Returns a numeric vector the same length as `x`; labels without an entry in
# `codes` (and NA inputs) yield NA. as.character() makes factor input index by
# label rather than by integer level.
code_responses <- function(x, codes) {
  unname(codes[as.character(x)])
}

hints_select_coded <- hints_select %>%
  mutate(
    SeekCancerInfo = code_responses(SeekCancerInfo, yes_no_codes),
    CancerFrustrated = code_responses(CancerFrustrated, agreement_codes),
    across(
      c(
        CancerTrustDoctor, CancerTrustFamily, CancerTrustGov,
        CancerTrustCharities, CancerTrustReligiousOrgs, CancerTrustScientists
      ),
      ~ code_responses(.x, trust_codes)
    ),
    Electronic2_HealthInfo = code_responses(
      Electronic2_HealthInfo, yes_no_codes
    ),
    MisleadingHealthInfo = code_responses(
      MisleadingHealthInfo, misleading_codes
    ),
    TrustHCSystem = code_responses(TrustHCSystem, hc_system_codes)
  )
print(head(hints_select_coded)) HHID updatedate SeekCancerInfo CancerFrustrated CancerTrustDoctor
1 21000006 13870396800 0 NA 3
2 21000009 13874630400 0 NA 3
3 21000020 13873680000 1 1 3
4 21000022 13867891200 0 NA 3
5 21000039 13866336000 0 NA 2
6 21000043 13866595200 0 NA 3
CancerTrustFamily CancerTrustGov CancerTrustCharities
1 NA NA NA
2 2 3 2
3 2 2 1
4 NA NA NA
5 2 2 0
6 2 2 3
CancerTrustReligiousOrgs CancerTrustScientists Electronic2_HealthInfo
1 NA NA NA
2 2 3 1
3 0 3 1
4 NA NA NA
5 0 2 1
6 1 3 1
MisleadingHealthInfo TrustHCSystem
1 0 3
2 0 3
3 2 2
4 0 2
5 3 2
6 3 1
In the summary table below, the mean trust score for doctors is higher than the mean trust score for the government. Given this information, we will also look at the Reddit dataset to compare the level of trust users express when their comments mention the government versus doctors. In addition, the mean frustration score for participants' most recent search for cancer information is approximately 1.105 on the 0–3 scale, which lies between “Somewhat disagree” and “Somewhat agree”. In the Reddit dataset, we look for an equivalent in textual data by running positive/negative and emotion sentiment analysis on comments that include the word “cancer”.
HHID updatedate SeekCancerInfo CancerFrustrated
Length:6252 Min. :1.387e+10 Min. :0.0000 Min. :0.000
Class :character 1st Qu.:1.387e+10 1st Qu.:0.0000 1st Qu.:0.000
Mode :character Median :1.387e+10 Median :0.0000 Median :1.000
Mean :1.387e+10 Mean :0.4654 Mean :1.105
3rd Qu.:1.387e+10 3rd Qu.:1.0000 3rd Qu.:2.000
Max. :1.389e+10 Max. :1.0000 Max. :3.000
NA's :17 NA's :3420
CancerTrustDoctor CancerTrustFamily CancerTrustGov CancerTrustCharities
Min. :0.000 Min. :1.000 Min. :0.00 Min. :0.000
1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:1.000
Median :3.000 Median :2.000 Median :2.00 Median :1.000
Mean :2.656 Mean :1.678 Mean :1.92 Mean :1.403
3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.00 3rd Qu.:2.000
Max. :3.000 Max. :3.000 Max. :3.00 Max. :3.000
NA's :94 NA's :783 NA's :273 NA's :308
CancerTrustReligiousOrgs CancerTrustScientists Electronic2_HealthInfo
Min. :0.0000 Min. :0.000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.0000
Median :1.0000 Median :3.000 Median :1.0000
Mean :0.9484 Mean :2.357 Mean :0.8534
3rd Qu.:2.0000 3rd Qu.:3.000 3rd Qu.:1.0000
Max. :3.0000 Max. :3.000 Max. :1.0000
NA's :280 NA's :218 NA's :1130
MisleadingHealthInfo TrustHCSystem
Min. :0.000 Min. :0.0
1st Qu.:1.000 1st Qu.:2.0
Median :2.000 Median :2.0
Mean :1.716 Mean :2.2
3rd Qu.:3.000 3rd Qu.:3.0
Max. :3.000 Max. :3.0
NA's :82 NA's :134
| Characteristic | N = 2,315¹ |
|---|---|
| SeekCancerInfo | |
| 1 | 2,315 (100%) |
| CancerFrustrated | |
| 0 | 759 (33%) |
| 1 | 780 (34%) |
| 2 | 624 (27%) |
| 3 | 152 (6.6%) |
| CancerTrustDoctor | |
| 0 | 11 (0.5%) |
| 1 | 72 (3.1%) |
| 2 | 441 (19%) |
| 3 | 1,791 (77%) |
| CancerTrustFamily | |
| 1 | 970 (42%) |
| 2 | 1,225 (53%) |
| 3 | 120 (5.2%) |
| CancerTrustGov | |
| 0 | 133 (5.7%) |
| 1 | 357 (15%) |
| 2 | 1,065 (46%) |
| 3 | 760 (33%) |
| CancerTrustCharities | |
| 0 | 251 (11%) |
| 1 | 798 (34%) |
| 2 | 1,086 (47%) |
| 3 | 180 (7.8%) |
| CancerTrustReligiousOrgs | |
| 0 | 907 (39%) |
| 1 | 832 (36%) |
| 2 | 494 (21%) |
| 3 | 82 (3.5%) |
| CancerTrustScientists | |
| 0 | 41 (1.8%) |
| 1 | 165 (7.1%) |
| 2 | 608 (26%) |
| 3 | 1,501 (65%) |
| Electronic2_HealthInfo | 2,203 (95%) |
| MisleadingHealthInfo | |
| 0 | 287 (12%) |
| 1 | 287 (12%) |
| 2 | 959 (41%) |
| 3 | 782 (34%) |
| TrustHCSystem | |
| 0 | 60 (2.6%) |
| 1 | 231 (10.0%) |
| 2 | 1,109 (48%) |
| 3 | 915 (40%) |
| ¹ n (%) | |
The box plot visualizes the median, the interquartile range, and the outliers of each coded variable in the dataset.
# Keep only the coded survey answers: the household id and the update
# timestamp are not plotted.
boxplot_data <- select(hints_select_coded, -c(HHID, updatedate))
# Stack all columns into (Variable, Value) pairs, one row per answer.
boxplot_data_long <- pivot_longer(
  boxplot_data,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# Box plot of the coded scores, one box per survey variable; outliers are drawn
# as small red points.
# NOTE: the text after the final call on the last line of this statement is
# captured console output from rendering, not code. The warning reports that
# rows with non-finite (NA) values were removed before the boxes were computed.
ggplot(boxplot_data_long, aes(x = Variable, y = Value)) +
geom_boxplot(outlier.colour = "red", outlier.size = 1) +
theme_minimal() +
# Slant the variable names on the x axis (presumably to keep the long names
# from overlapping).
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# The y axis is shown from -1 to 4, slightly wider than the 0-3 coded scale.
labs(
title = "HINTS Boxplot",
x = "Variables",
y = "Values"
) + coord_cartesian(ylim = c(-1, 4)) Warning: Removed 6739 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Using a correlation plot, we evaluate the relationships between the different survey questions. In the correlation plot below, we see a negative correlation between trust in doctors and trust in the government, scientists, and the healthcare system. Further statistical testing can be performed to better understand this initial evaluation.
corrplot 0.95 loaded
Warning in cor(cor_matrix[, sapply(cor_matrix, is.numeric)], use =
"complete.obs"): the standard deviation is zero
The HINTS dataset provides insight into perceptions of healthcare and cancer information. We will examine whether the same trends appear in the Reddit dataset. Using the Reddit dataset, we will explore sentiments such as positive/negative polarity, frustration, and trust. We will also look at word frequency counts to review which topics Reddit users commonly comment about.